Note: Non-letters are sometimes illegal variable names. In the analysis, the subspecies name aus, indica, tropical japonica and temperate japonica is abbreviated as AUS, IND, TRJ and TEJ.
Note: the following script need running in Bash command line.
# Construct phylogenetic tree by by IQ-TREE v1.6.12
gunzip merge_seq.fa.gz # decompress
iqtree -s merge_seq.fa -st DNA -m TEST -bb 1000 -alrt 1000 -nt 20 -pre iqtree165 -quiet
# Generate annotation file
# Note: table2itol.R from https://github.com/mgoeker/table2itol
Rscript ../script/table2itol.R -a -d -c none -D planB -b Subspecies -i OTUID -l OTUID -t %s -w 0.5 ../data/cultivar.txt
Then the tree file is visulized in iTOL (https://itol.embl.de/). Add tree color file (iTOL_treecolors-Subspecies.txt) as annotation. Save file as 1a.Tree165SubSpecies.svg
(A) Unrooted phylogenetic tree of the 165 landrace rice varieties examined in this study based on 58,450 missense SNPs. Branch length is shown according to the scale. Branches are colored according to subspecies: aus (green), indica (orange), tropical japonica (red), and temperate japonica (blue).
This figure is hand-drawn by using Adobe Illustrator.
The plotting script following our previous study in Nature Biotechnology(doi: 10.1038/s41587-019-0104-4). Detail in https://github.com/YongxinLiu/Note/tree/master/R/format2graphlan
The data download from previous published papers, detail in References.
metadata = read.table("../data/cultivar_LN_metadata.txt", header=T, row.names=1, sep="\t", comment.char="", stringsAsFactors=F)
# Set subspecies level
level=c("AUS","IND","TRJ","TEJ" )
metadata$group = factor(metadata$Subspecies, level)
levels(metadata$group) = c("aus","indica","tropical japonica","temperate japonica")
sub_design = metadata
m = "bray"
beta = read.table(paste("../data/cultivar_LN_otutab_norm.txt",sep=""), header=T, row.names=1, sep="\t", comment.char="")
# Extract only those samples in common between the two tables
idx = rownames(sub_design) %in% colnames(beta)
sub_design=sub_design[idx,]
sub_beta=beta[,rownames(sub_design)]
# normalize to 100
otutab = t(sub_beta)/colSums(sub_beta,na=T)*100
# Constrained analysis OTU table by genotype
capscale.gen = capscale(otutab ~ group, data=sub_design, add=F, sqrt.dist=T, distance= m)
# ANOVA-like permutation analysis
perm_anova.gen = anova.cca(capscale.gen, permutations = 1000, parallel = 9)
# generate variability tables and calculate confidence intervals for the variance
var_tbl.gen = variability_table(capscale.gen)
eig = capscale.gen$CCA$eig
variance = var_tbl.gen["constrained", "proportion"]
p.val = perm_anova.gen[1, 4]
# extract the weighted average (sample) scores
points = capscale.gen$CCA$wa
# save coordinate site
write.table("Axis\t", file=paste("1d.cpcoa.txt",sep = ""), append = F, sep="\t", quote=F, eol = "",row.names=F, col.names=F)
suppressWarnings(write.table(points, file=paste("1d.cpcoa.txt",sep = ""), append = T, sep="\t", quote=F, row.names=T, col.names=T))
points = capscale.gen$CCA$wa[, 1:2]
points = as.data.frame(points)
points = cbind(points, sub_design$group)
colnames(points) = c("PC1", "PC2", "group")
# The picture is reversed horizontally (optional)
points$PC2 = points$PC2 * -1
# plot PC 1 and 2
p = ggplot(points, aes(x=PC1, y=PC2, color=group)) + geom_point(alpha=.7, size=2) +
labs(x=paste("CPCo1 (", format(100 * eig[1] / sum(eig), digits=4), "%)", sep=""),
y=paste("CPCo2 (", format(100 * eig[2] / sum(eig), digits=4), "%)", sep="")) +
ggtitle(paste(format(100 * variance, digits=3), " % of variance; p=",format(p.val, digits=2),sep="")) +
theme_classic() + main_theme
p = p + stat_ellipse(level=0.68)
color = c("#00FF00","#FF8000", "#FF0000", "#00FFFF" )
p = p + scale_color_manual(values = color)
pggsave(paste("1d.LN_beta_cpcoa.pdf", sep=""), p, width = w*1.25, height = h*1.15, units = "mm")# reorder subspecies
points$group = factor(points$group, levels = c("temperate japonica","tropical japonica","indica","aus"))
## plot error bar plot
index = points[,c(1,3)]
index2 <- summarySE(index, measurevar = "PC1", groupvars ="group")
p=ggplot(data=index2, mapping=aes(x=group, y=PC1, fill=group)) +
geom_errorbar(aes(ymin=PC1-sd, ymax=PC1+sd), width=.3) +
geom_dotplot(binaxis='y',stackdir='center',dotsize=1, color='black')+
coord_flip() + scale_color_manual(values = color) + main_theme + ylim(-0.2, 0.2)
pggsave(paste("1d.LN_beta_cpcoa1_errorbar.pdf", sep=""), p, width = w*1.63, height = h*0.6, units = "mm")The combo figure as following
Genotype + Phenotype(Tiller number) + Root microbiota
Note: the following script need running in Bash command line.
Construct subspecies tree. Using ARO as out group.
All the related scripts in folder script.
# decompress SNP sequences
gunzip merge_clean.fa.gz
# get SNP sequences in each subspecies
for i in IND TEJ TRJ AUS ARO; do
# Select subspecies ID
grep -P "\t${i}$" ../doc/minicore_subspecies.txt | cut -f 1 > temp/subspecies_${i}.id
# get sequences by usearch v10.0
usearch10 -fastx_getseqs data/merge_clean.fa -labels temp/subspecies_${i}.id -fastaout temp/subspecies_${i}.fa
# transform multiply sequences fasta in one line format
format_fasta_1line.pl -i temp/subspecies_${i}.fa -o temp/subspecies_${i}1.fa
# get consensus sequences in each subspecies
consensus_fa.pl -i temp/subspecies_${i}1.fa -o temp/subspecies_${i}_consensus
done
cat temp/subspecies_*_consensus.fa > temp/consensus.fa
sed -i 's/subspecies_//;s/_consensus//' temp/consensus.fa
iqtree -s temp/consensus.fa -st DNA -m TEST -bb 1000 -alrt 1000 -nt 20 -pre fig1/subspecies5 -quiet -redoThen the tree file is visulized in iTOL (https://itol.embl.de/). Save file as subspecies5_branch.svg and subspecies5_branchno.svg
phenoLN = read.table("../data/cultivar_LN_pheno.txt", header=T, row.names=1, sep="\t", comment.char="", stringsAsFactors=F)
design = read.table("../data/cultivar_LN_metadata.txt", header=T, row.names=1, sep="\t", comment.char="")
phenoLN = phenoLN[rownames(design),]
design$Subspecies = factor(design$Subspecies, levels = c("TEJ","TRJ","IND","AUS"))
# amplicon packages from github: https://github.com/microbiota/amplicon
library(amplicon)
write.table(cbind(design, phenoLN[rownames(design),]), file=paste("../data/cultivar_LN_pheno_metadata.txt", sep="_"), append = F, sep="\t", quote=F, row.names=F, col.names=T)
p = alpha_boxplot(phenoLN, design, index = "tiller", "Subspecies")
p = p + coord_flip(clip = "on")
color = c("#00FFFF", "#FF0000", "#FF8000", "#00FF00")
p = p + scale_color_manual(values = color)
pggsave(paste("1e.tiller_LN_subspecies.pdf", sep=""), p, width = 4, height =4 )library(dplyr)
metadata = read.table(paste("../data/cultivar_LN_metadata.txt",sep=""), header=T, row.names=1, sep="\t", comment.char="")
distance_type = "bray_curtis"
distance_mat = read.table(paste0("../data/cultivar_LN_",distance_type,".txt"), header=T, row.names=1, sep="\t", comment.char="")
# distance_mat[1:3, 1:3]
# amplicon packages from github: https://github.com/microbiota/amplicon
library(amplicon)
# Plotting Constrained PCoA based on distance matrix
(p = beta_pcoa(distance_mat, metadata, groupID = "Subspecies"))# Add arrow
points = p$data
arrows_pos = points %>% group_by(group) %>% summarise_all(mean)
arrows_pos = as.data.frame(arrows_pos)
arrows_pos$x_start = 0
arrows_pos$y_start = 0
p = p + geom_segment(data = arrows_pos, aes(x = x_start, y = y_start, xend = x, yend = y),
arrow = arrow(length = unit(0.2, "cm")),color="black",alpha=0.6)
p# set scale to save plot
x = 0.2557
y= 0.1284
ggsave(paste0("1e.LN_pcoa_",distance_type,"_arrow_scale.pdf"), p, width = 89, height = 89*y/x, units = "mm")PCoA plot cluster into dendrogram
# scaling by expained variance
arrows_pos$x = arrows_pos$x * x
arrows_pos$y = arrows_pos$y * y
# calculate euclidean distance
df = arrows_pos[,2:3]
rownames(df) = arrows_pos$group
dat_dist <- vegdist(df, method="euclidean")
# hclust
dat_dist_clu <- hclust(dat_dist, "average")
# plot
pdf(paste0("1e.LN_pcoa_",distance_type,"_hclust.pdf"), width=4, height=3)
plot(dat_dist_clu, main = "Hclust with euclidean", lwd=1.5, cex=.5)
dev.off()png
2
The combo figure as following
(E) Root microbiota showed the same clustering patterns as tiller number and population structure of the field-grown rice population. Left panel: the population structure of four rice subspecies generated with the consensus SNPs. Middle panel: boxplots showing the distribution of tiller number in aus, indica, tropical japonica, and temperate japonica, including tiller numbers from six individual plants per variety. Right panel: clustering of root microbiota centroid of each rice subspecies in the unconstrained PCoA (Table S4). The vertical bars within boxes represent medians. Different letters indicate significantly different groups (P < 0.05, ANOVA, Tukey’s HSD). The left and right sides of the boxes represent the 25th and 75th quartiles, respectively. The left and right whiskers extend to data within no more than 1.5 the interquartile range from the left edge and right edge of the box, respectively. The numbers of rice varieties in the figure are as follows: aus (n = 37), indica (n = 66), tropical japonica (n = 35), and temperate japonica (n = 27).
# Tiller number
phenotype = read.table("../data/cultivar_LN_meta.txt", row.names = 1,header = T)
# PCA of genotype
genotype = read.table("../data/genotype.pca.eigenvec", row.names = 1)
genotype = genotype[rownames(phenotype),2:5]
colnames(genotype) = c("PC1","PC2","PC3","PC4")
# PCoA of microbiota
pcoa = read.table("../data/cultivar_LN_bray_curtis14.txt", row.names = 1, header = T)
pcoa = pcoa[rownames(phenotype),]
colnames(pcoa) = c("PCo1","PCo2","PCo3","PCo4")
# merge phenotype, microbiota and genotype
df = cbind(phenotype[,"tiller_raw"], pcoa, genotype)
colnames(df)[1] = "tiller"
# save
suppressWarnings(write.table(df, file=paste("../data/", "cultivar_LN_tiller_M_G.data",sep = ""), append = F, sep="\t", quote=F, row.names=T, col.names=T))
# read file
a <- read.table(paste("../data/", "cultivar_LN_tiller_M_G.data",sep = ""))
fit <- lm(a[,1] ~ a[,2]+a[,3]+a[,4]+a[,5]+a[,6]+a[,7]+a[,8]+a[,9])
# summary
sfit <- summary(fit)
# summary r.squared, also called explained variance
rsquare <- sfit$r.squared
write.table(paste("Microbiota & Genotype",rsquare,sep = " "),file = "1f.contributionsFieldI.txt", append = TRUE, quote = F, sep=" ", na = "NA", dec = ".", row.names = F, col.names = F)
fit <- lm(a[,1] ~ a[,2]+a[,3]+a[,4]+a[,5])
sfit <- summary(fit)
rsquare <- sfit$r.squared
write.table(paste("Microbiota",rsquare,sep = " "),file = "1f.contributionsFieldI.txt", append = TRUE, quote = F, sep=" ", na = "NA", dec = ".", row.names = F, col.names = F)
fit <- lm(a[,1] ~ a[,6]+a[,7]+a[,8]+a[,9])
sfit <- summary(fit)
rsquare <- sfit$r.squared
write.table(paste("Genotype",rsquare,sep = " "),file = "1f.contributionsFieldI.txt", append = TRUE, quote = F, sep=" ", na = "NA", dec = ".", row.names = F, col.names = F)